from pyhive import hive
import pandas as pd
import plotly.express as px
conn = hive.Connection(host='wunder9l.man.yp-c.yandex.net', port='10000')
pd.read_sql(
'select artist_lastfm, sum(scrobbles_lastfm) as total_scrobbles from hue__tmp_artists group by artist_lastfm order by total_scrobbles desc limit 10',
conn
)
| artist_lastfm | total_scrobbles | |
|---|---|---|
| 0 | Phoenix | 1099743582 |
| 1 | The Beatles | 517126254 |
| 2 | Radiohead | 499548797 |
| 3 | Coma | 447745298 |
| 4 | Coldplay | 360111850 |
| 5 | Muse | 344838631 |
| 6 | Arctic Monkeys | 332306552 |
| 7 | Exo | 314081383 |
| 8 | Pink Floyd | 313236119 |
| 9 | Ghost | 312486317 |
request = '''select tag, count(1) as cnt
from hue__tmp_artists
LATERAL VIEW explode(split(tags_lastfm, '; ')) tempTable as tag
group by tag order by cnt desc limit 10
'''
pd.read_sql(request, conn)
| tag | cnt | |
|---|---|---|
| 0 | 1083944 | |
| 1 | seen live | 99537 |
| 2 | rock | 73299 |
| 3 | electronic | 70648 |
| 4 | under 2000 listeners | 50827 |
| 5 | All | 50166 |
| 6 | pop | 48435 |
| 7 | indie | 47452 |
| 8 | alternative | 43727 |
| 9 | experimental | 40840 |
Здесь почему-то затесался пустой тег, видимо где то есть "; " на конце или другое некорректное заполнение
Для выполнения воспользуемся двумя запросами: сначала с помощью предыдущего запроса достанем теги, затем вторым запросом отфильтруем только те записи, которые содержат этот тег.
request = '''select tag, count(1) as cnt
from hue__tmp_artists
LATERAL VIEW explode(split(tags_lastfm, '; ')) tempTable as tag
group by tag order by cnt desc limit 12
'''
tags = pd.read_sql(request, conn)
top_10_tags = [f'"{x}"' for x in tags.tag[1:11]]
top_10_tags
['"seen live"', '"rock"', '"electronic"', '"under 2000 listeners"', '"All"', '"pop"', '"indie"', '"alternative"', '"experimental"', '"female vocalists"']
condition = ' or '.join(f"array_contains(split(tags_lastfm, '; '), {t})" for t in top_10_tags)
request = '''select artist_lastfm, count(1) as cnt
from hue__tmp_artists where
''' + condition + 'group by artist_lastfm order by cnt desc limit 10'
pd.read_sql(request, conn)
| artist_lastfm | cnt | |
|---|---|---|
| 0 | Karma | 26 |
| 1 | Angel | 23 |
| 2 | Nemesis | 23 |
| 3 | Eden | 22 |
| 4 | Eclipse | 21 |
| 5 | Bliss | 21 |
| 6 | Mirage | 20 |
| 7 | Odyssey | 20 |
| 8 | Indigo | 19 |
| 9 | Paradox | 19 |
Построим по странам pie-chart по популярности
request = '''SELECT country, count(1) as cnt
FROM hue__tmp_artists
LATERAL VIEW explode(split(country_lastfm, "; ")) tmpTable as country
where length(country) > 0
GROUP BY country order by cnt desc'''
df = pd.read_sql(request, conn)
# px.pie(df, )
px.pie(df[df.cnt > 10000], values='cnt', names='country', title="Страны с наибольшим количеством песен")